4 data wrangling tasks in R
Add a column to an existing data frame
dataFrame$newColumn <- dataFrame$oldColumn1 + dataFrame$oldColumn2
R's transform() function
DataFrame <- transform(dataFrame, newColumnName = some equation)
companiesData <- transform(companiesData, margin = (profit/revenue) * 100)
R's apply() function
dataFrame$newColumn <- apply(dataFrame, 1, function(x) { . . . } )
apply(companiesData, 1, function(x) sum(x))
apply(companiesData[,c('revenue', 'profit')], 1, function(x) sum(x))
companiesData$sums <- apply(companiesData[,c('revenue', 'profit')], 1, function(x) sum(x))
mapply()
dataFrame$newColumn <- mapply(someFunction, dataFrame$column1, dataFrame$column2, dataFrame$column3)
companiesData$margin <- mapply(profitMargin, companiesData$profit, companiesData$revenue)
dplyr
companiesData <- mutate(companiesData, margin = round((profit/revenue) * 100, 1))
Getting summaries by subgroups of your data
highestMargin <- max(companiesData$margin)
highestMargin <- subset(companiesData, margin==max(margin))
highestProfitMargins <- ddply(companiesData, .(company), summarize, bestMargin = max(margin))
Bonus special case: Grouping by date range
vDates <- as.Date(c("2013-06-01", "2013-07-08", "2013-09-01", "2013-09-15"))
vDates.bymonth <- cut(vDates, breaks = "month")
Sorting your results
companyOrder <- order(companiesData$margin)
companiesOrdered <- companiesData[order(-companiesData$margin),]
Reshaping: Wide to long (and back)
library(reshape2)
longData <- melt(your original data frame, a vector of your category variables)
companiesLong <- melt(companiesData, c("fy", "company"))
companiesLong <- melt(companiesData, id.vars=c("fy", "company"), measure.vars=c("revenue", "profit", "margin"), variable.name="financialCategory", value.name="amount")
companiesWide <- dcast(companiesLong, fy + company ~ financialCategory, value.var="amount")
data-wrangling-cheatsheet